(special version) \(Let\;A\in\mathbb{R}^{n\times n},\;v'\;is\;column\;vector\;in\;\mathbb{R}^n,\;and\;1-\mathcal{v}A^{-1}\mathcal{v'}\neq0.\;Then\)
\[ \left(A-\mathcal{v'}\mathcal{v}\right)^{-1}=\left(A^{-1}-\cfrac{A^{-1}\mathcal{v'}\mathcal{v}A^{-1}}{1-\mathcal{v}A^{-1}\mathcal{v'}}\right) \]
(leave \(i_{th}\) out)
\[ \hat{\beta_{(i)}}=(\mathbb{X'_{(i)}}\mathbb{X_{(i)}})^{-1}\mathbb{X'_{(i)}}\mathbb{Y_{(i)}}=\left(\mathbb{X'}\mathbb{X}-X_i'X_i\right)^{-1}\left(\mathbb{X'}\mathbb{Y}-X_i'Y_i\right) \]( By Sherman–Morrison formula ) \[ =\left((\mathbb{X'}\mathbb{X})^{-1}+\cfrac{(\mathbb{X'}\mathbb{X})^{-1}X_i'X_i(\mathbb{X'}\mathbb{X})^{-1}}{1-X_i(\mathbb{X'}\mathbb{X})^{-1}X_i'}\right)\left(\mathbb{X'}\mathbb{Y}-X_i'Y_i\right) \]( \(h_{ii}=X_i(\mathbb{X'}\mathbb{X})^{-1}X_i'\) ) \[ =\left((\mathbb{X'}\mathbb{X})^{-1}+\cfrac{(\mathbb{X'}\mathbb{X})^{-1}X_i'X_i(\mathbb{X'}\mathbb{X})^{-1}}{1-h_{ii}}\right)\left(\mathbb{X'}\mathbb{Y}-X_i'Y_i\right) \]( Expend ) \[ =\left[(\mathbb{X'}\mathbb{X})^{-1}\mathbb{X'}\mathbb{Y}\right] -\left[(\mathbb{X'}\mathbb{X})^{-1}X_i'Y_i\right] +\left[\cfrac{(\mathbb{X'}\mathbb{X})^{-1}X_i'X_i(\mathbb{X'}\mathbb{X})^{-1}\mathbb{X'}\mathbb{Y}}{1-h_{ii}}\right] -\left[\cfrac{(\mathbb{X'}\mathbb{X})^{-1}X_i'X_i(\mathbb{X'}\mathbb{X})^{-1}X_i'Y_i}{1-h_{ii}}\right] \]( \(\hat{\beta}=(\mathbb{X'}\mathbb{X})^{-1}\mathbb{X'}\mathbb{Y}\) and \(h_{ii}=X_i(\mathbb{X'}\mathbb{X})^{-1}X_i'\) ) \[ = \left[\hat{\beta}\right] -\left[(\mathbb{X'}\mathbb{X})^{-1}X_i'Y_i\right] +\left[\cfrac{(\mathbb{X'}\mathbb{X})^{-1}X_i'X_i\hat{\beta}}{1-h_{ii}}\right] -\left[\cfrac{(\mathbb{X'}\mathbb{X})^{-1}X_i'h_{ii}Y_i}{1-h_{ii}}\right] \]( Times \(\frac{1-h_{ii}}{1-h_{ii}}\) for \((\mathbb{X'}\mathbb{X})^{-1}X_i'Y_i\) ) \[ = \left[\hat{\beta}\right] -\left[\cfrac{(\mathbb{X'}\mathbb{X})^{-1}X_i'Y_i(1-h_{ii})}{1-h_{ii}}\right] +\left[\cfrac{(\mathbb{X'}\mathbb{X})^{-1}X_i'X_i\hat{\beta}}{1-h_{ii}}\right] -\left[\cfrac{(\mathbb{X'}\mathbb{X})^{-1}X_i'h_{ii}Y_i}{1-h_{ii}}\right] \]( Simplified ) \[ = \hat{\beta} -\left(\cfrac{(\mathbb{X'}\mathbb{X})^{-1}X_i'}{1-h_{ii}}\right) \left( +\left[Y_i(1-h_{ii})\right] -\left[X_i\hat{\beta}\right] +\left[h_{ii}Y_i\right] \right) \]( Simplified ) \[ = \hat{\beta} -\left(\cfrac{(\mathbb{X'}\mathbb{X})^{-1}X_i'}{1-h_{ii}}\right) \left( Y_i-Y_ih_{ii} -X_i\hat{\beta} +h_{ii}Y_i \right) \]( Simplified ) \[ = \hat{\beta} -\left(\cfrac{(\mathbb{X'}\mathbb{X})^{-1}X_i'}{1-h_{ii}}\right) \left( Y_i -X_i\hat{\beta} \right) \]( \(e_i=Y_i-X_i\hat{\beta}\) ) \[ = \hat{\beta} -\left(\cfrac{(\mathbb{X'}\mathbb{X})^{-1}X_i'}{1-h_{ii}}\right)e_i \]( \(h_{ii}=X_i(\mathbb{X'}\mathbb{X})^{-1}X_i'\) )
\[ \large Y_i-\hat{Y}_{i(i)}=Y_i-X_i\hat{\beta_{(i)}}=Y_i-X_i\left[\hat{\beta}-\left(\cfrac{(\mathbb{X'}\mathbb{X})^{-1}X_i'}{1-h_{ii}}\right)e_i\right] \]( Expend ) \[ \large =Y_i-X_i\hat{\beta}-\cfrac{X_i(\mathbb{X'}\mathbb{X})^{-1}X_i'}{1-h_{ii}}e_i \] ( \(\hat{Y_{i}}=X_i\hat{\beta}\) ) \[ \large =Y_i-\hat{Y_i}-\left(\cfrac{(\mathbb{X'}\mathbb{X})^{-1}X_i'}{1-h_{ii}}\right)e_i \] ( \(e_i=Y_i-\hat{Y_i}\) ) \[ \large =e_i-\left(\cfrac{X_i(\mathbb{X'}\mathbb{X})^{-1}X_i'}{1-h_{ii}}\right)e_i \]( \(h_{ii}=X_i(\mathbb{X'}\mathbb{X})^{-1}X_i'\) ) \[ \large =e_i-\left(\cfrac{h_{ii}}{1-h_{ii}}\right)e_i \] ( Simplified ) \[ \large =\cfrac{e_i}{1-h_{ii}} \]
\[ \huge Y_i-\hat{Y}_{i(i)}=\cfrac{e_i}{1-h_{ii}} \]
##
## Call:
## lm(formula = Y ~ ., data = finProd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3753 -0.6984 -0.0450 0.6721 3.3469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.9854032 0.0093865 211.516 < 2e-16 ***
## X1 0.0085715 0.0032699 2.621 0.00877 **
## X2 -0.0009231 0.0031913 -0.289 0.77240
## X3 5.0002239 0.0033312 1501.023 < 2e-16 ***
## X4 0.0249426 0.0032588 7.654 2.10e-14 ***
## X5 -0.0025470 0.0032677 -0.779 0.43573
## X6 0.0029273 0.0033109 0.884 0.37664
## X7 -3.1044113 0.0032371 -959.010 < 2e-16 ***
## X8 7.1990570 0.0032677 2203.127 < 2e-16 ***
## X9 0.0271898 0.0031878 8.529 < 2e-16 ***
## X10 -0.0128152 0.0032669 -3.923 8.80e-05 ***
## X11 0.0068313 0.0032426 2.107 0.03516 *
## X12 -2.2036231 0.0032205 -684.249 < 2e-16 ***
## X13 0.0091387 0.0033051 2.765 0.00570 **
## X14 0.0221461 0.0032838 6.744 1.61e-11 ***
## X15 1.2838838 0.0032743 392.114 < 2e-16 ***
## X16 -0.0153460 0.0032427 -4.732 2.24e-06 ***
## X17 0.0136628 0.0033140 4.123 3.77e-05 ***
## X18 -0.0079102 0.0032584 -2.428 0.01521 *
## X19 0.0089788 0.0033655 2.668 0.00764 **
## X20 -0.0010564 0.0032502 -0.325 0.74516
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.022 on 11979 degrees of freedom
## Multiple R-squared: 0.9987, Adjusted R-squared: 0.9987
## F-statistic: 4.443e+05 on 20 and 11979 DF, p-value: < 2.2e-16
| Sum of Square | deree of freedom | Mean Square | F-test | |
|---|---|---|---|---|
| Regession | 9284797.79028282 | 20 | 464239.889514141 | 444291.054324739 |
| Error | 12516.8615986249 | 11979 | 1.04490037554261 | |
| Total | 9297314.65188144 | 11999 |
anova_info = function(dm,lm1=list(),k=1){
n = nrow(dm)
p = ncol(dm)
colnames(dm)[k] = "Y"
if(length(lm1)==0){
lm1 = lm(Y~., data = dm)
}
SSR = sum((fitted(lm1)-mean(dm$Y))^2)
SSE = sum((dm$Y-fitted(lm1))^2)
df_r = p-1
df_e = n-p
MSR = SSR/df_r
MSE = SSE/df_e
F_value = MSR/MSE
anova_fin = as.data.frame(matrix(data = c(SSR, df_r, MSR,F_value,
SSE, df_e, MSE, "",
SSR+SSE, df_r+df_e, "", ""), ncol = 4, nrow = 3, byrow = T))
colnames(anova_fin) = c("Sum of Square", "deree of freedom", "Mean Square", "F-test")
rownames(anova_fin) = c("Regession", "Error", "Total")
F_test = qf(0.95, df_r, df_e)
return(list(n=n, p=p, SSR=SSR, df_r=df_r, MSR=MSR, SSE=SSE, df_e=df_e, MSE=MSE, F_value=F_value, F_test=F_test, anova_table = anova_fin))
}
## [1] -2.116997e-17
## [1] 1.043159
## [1] 52158.88
fin_dmf = data.frame(Y = finProd$Y)
n_var = 1:20
f_valuef1 = 0
for(i in 1:20){
f_com = c()
for(j in n_var){
fin_dm = as.data.frame(cbind(fin_dmf, finProd[,j+1]))
anova_infoi = anova_info(fin_dm)
f_com = rbind(f_com, anova_infoi$F_value)
}
n_var = n_var[-which.max(f_com)]
f_valuef2 = max(f_com)
print(c(f_valuef1,f_valuef2))
if(f_valuef1<=f_valuef2){
fin_dmf = as.data.frame(cbind(fin_dmf, finProd[,which.max(f_com)+1]))
colnames(fin_dmf)[length(fin_dmf)] = colnames(finProd)[which.max(f_com)+1]
f_valuef1 = f_valuef2
}else{
break
f_value.f = f_valuef1
}
}
## [1] 522.4921
#Backward
fin_dmb = finProd
n_var = 20
f_valueb1 = 0
for(i in 1:20){
f_com = c()
for(j in 1:n_var){
fin_dm = fin_dmb[,-(j+1)]
anova_infoi = anova_info(fin_dm)
f_com = rbind(f_com, anova_infoi$F_value)
}
n_var = n_var-1
f_valueb2 = min(f_com)
print(c(f_valueb1,f_valueb2))
if(f_valueb1<=f_valueb2){
fin_dmb = fin_dmb[,-which.min(f_com)]
f_valueb1 = f_valueb2
}else{
break
f_value.b = f_valueb1
}
}
##
## Call:
## lm(formula = Y ~ X1 + X3 + X7, data = finProd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.697 -17.949 -0.532 17.506 52.398
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.68713 0.20022 8.426 < 2e-16 ***
## X1 -0.42303 0.06926 -6.108 1.04e-09 ***
## X3 5.15733 0.07051 73.142 < 2e-16 ***
## X7 -3.17461 0.06880 -46.144 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.92 on 11996 degrees of freedom
## Multiple R-squared: 0.3802, Adjusted R-squared: 0.38
## F-statistic: 2452 on 3 and 11996 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = Y ~ X1 + X2 + X3 + X7 + X8 + X10 + X12, data = finProd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9517 -3.2229 0.0982 2.9676 8.4138
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0919612 0.0349484 59.859 < 2e-16 ***
## X1 -0.0947299 0.0121282 -7.811 6.16e-15 ***
## X2 -0.0004375 0.0118802 -0.037 0.971
## X3 5.0019255 0.0123067 406.440 < 2e-16 ***
## X7 -3.0797764 0.0120247 -256.122 < 2e-16 ***
## X8 7.1959379 0.0121521 592.156 < 2e-16 ***
## X10 0.0183598 0.0121442 1.512 0.131
## X12 -2.1791981 0.0119916 -181.727 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.822 on 11992 degrees of freedom
## Multiple R-squared: 0.9812, Adjusted R-squared: 0.9811
## F-statistic: 8.92e+04 on 7 and 11992 DF, p-value: < 2.2e-16
Contemporary China is on the leading edge of a sexual revolution, with tremendous regional and generational differences that provide unparalleled natural experiments for analysis of the antecedents and outcomes of sexual behaviour. The Chinese Health and Family Life Study, conducted 1999-2000 as a collaborative research project of the Universities of Chicago, Beijing, and North Carolina, provides a baseline from which to anticipate and track future changes. Specifically, this study produces a baseline set of results on sexual behaviour and disease patterns, using a nationally representative probability sample. The Chinese Health and Family Life Survey sampled 60 villages and urban neighbourhoods chosen in such a way as to represent the full geographical and socioeconomic range of contemporary China excluding Hong Kong and Tibet. Eighty-three individuals were chosen at random for each location from official registers of adults aged between 20 and 64 years to target a sample of 5000 individuals in total. Here, we restrict our attention to women with current male partners for whom no information was missing, leading to a sample of 1534 women with the following variables.
##
## Call:
## lm(formula = R_income ~ ., data = chfls.f)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2696.3 -234.1 -51.0 122.0 9202.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -671.53515 702.02176 -0.957 0.338935
## R_region2 4.83814 51.50015 0.094 0.925166
## R_region3 -127.06389 65.19238 -1.949 0.051472 .
## R_region4 -62.35862 55.62069 -1.121 0.262405
## R_region5 -194.52795 55.13442 -3.528 0.000431 ***
## R_region6 -179.59610 59.51997 -3.017 0.002592 **
## R_age 5.16464 1.82664 2.827 0.004754 **
## R_edu 128.90173 19.28215 6.685 3.23e-11 ***
## R_health 30.16235 19.35735 1.558 0.119398
## R_height 1.68935 3.57914 0.472 0.636995
## R_happy 30.33336 30.45988 0.996 0.319484
## A_height -0.48564 3.42883 -0.142 0.887388
## A_edu 37.30153 18.90795 1.973 0.048700 *
## A_income 0.23330 0.01505 15.497 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 637.4 on 1517 degrees of freedom
## Multiple R-squared: 0.2841, Adjusted R-squared: 0.278
## F-statistic: 46.31 on 13 and 1517 DF, p-value: < 2.2e-16
## [1] 436.4633
chfls_in = chfls.f%>%select(4,1,2,3,5,6,7,8,9,10)
chflsf_in = data.frame(Y = chfls_in$R_income)
n_var = 1:9
f_valuef1in = 0
for(i in 1:9){
f_com = c()
for(j in n_var){
chfls_dm = as.data.frame(cbind(chflsf_in, chfls_in[,j+1]))
anova_infoi = anova_info(chfls_dm)
f_com = rbind(f_com, anova_infoi$F_value)
}
n_var = n_var[-which.max(f_com)]
f_valuef2 = max(f_com)
print(c(f_valuef1in,f_valuef2))
if(f_valuef1in<=f_valuef2){
chflsf_in = as.data.frame(cbind(chflsf_in, chfls_in[,which.max(f_com)+1]))
colnames(chflsf_in)[length(chflsf_in)] = colnames(chfls.f)[which.max(f_com)+1]
f_valuef1in = f_valuef2
}else{
break
}
}
## [1] 94.49045
#Backward
chflsb_in = chfls_in
colnames(chflsb_in)[1] = "Y"
n_var = 9
f_valueb1in = 0
for(i in 1:8){
f_com = c()
for(j in 1:n_var){
chfls_dm = chflsb_in[,-(j+1)]
anova_infoi = anova_info(chfls_dm)
f_com = rbind(f_com, anova_infoi$F_value)
}
n_var = n_var-1
f_valueb2 = min(f_com)
print(c(f_valueb1in,f_valueb2))
if(f_valueb1in<=f_valueb2){
chflsb_in = chflsb_in[,-which.min(f_com)]
f_valueb1in = f_valueb2
}else{
break
}
}
##
## Call:
## lm(formula = A_income ~ ., data = chfls.f)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5702.3 -419.1 -174.3 146.2 9064.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -228.8533 1112.8103 -0.206 0.8371
## R_region2 -57.2081 81.5991 -0.701 0.4834
## R_region3 -413.5512 102.8929 -4.019 6.13e-05 ***
## R_region4 -192.1876 88.0402 -2.183 0.0292 *
## R_region5 -399.2709 87.1280 -4.583 4.97e-06 ***
## R_region6 -457.7355 93.8708 -4.876 1.19e-06 ***
## R_age -12.9700 2.8831 -4.499 7.36e-06 ***
## R_edu 65.0357 30.9581 2.101 0.0358 *
## R_income 0.5859 0.0378 15.497 < 2e-16 ***
## R_health -1.3054 30.7000 -0.043 0.9661
## R_height 1.3437 5.6722 0.237 0.8128
## R_happy 63.5725 48.2578 1.317 0.1879
## A_height 2.6503 5.4333 0.488 0.6258
## A_edu 145.6031 29.7679 4.891 1.11e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1010 on 1517 degrees of freedom
## Multiple R-squared: 0.2937, Adjusted R-squared: 0.2877
## F-statistic: 48.53 on 13 and 1517 DF, p-value: < 2.2e-16
## [1] 436.4633
chfls_ain = chfls.f%>%select(10,1,2,3,4,5,6,7,8,9,10)
chflsf_ain = data.frame(Y = chfls_ain$A_income)
n_var = 1:9
f_valuef1ain = 0
for(i in 1:9){
f_com = c()
for(j in n_var){
chfls_dm = as.data.frame(cbind(chflsf_ain, chfls_ain[,j+1]))
anova_infoi = anova_info(chfls_dm)
f_com = rbind(f_com, anova_infoi$F_value)
}
n_var = n_var[-which.max(f_com)]
f_valuef2 = max(f_com)
print(c(f_valuef1ain,f_valuef2))
if(f_valuef1ain<=f_valuef2){
chflsf_ain = as.data.frame(cbind(chflsf_ain, chfls_ain[,which.max(f_com)+1]))
colnames(chflsf_ain)[length(chflsf_ain)] = colnames(chfls.f)[which.max(f_com)+1]
f_valuef1ain = f_valuef2
}else{
break
}
}
## [1] 47.59401
chflsb_ain = chfls_ain
colnames(chflsb_ain)[1] = "Y"
n_var = 9
f_valueb1ain = 0
for(i in 1:8){
f_com = c()
for(j in 1:n_var){
chfls_dm = chflsb_ain[,-(j+1)]
anova_infoi = anova_info(chfls_dm)
f_com = rbind(f_com, anova_infoi$F_value)
}
n_var = n_var-1
f_valueb2 = min(f_com)
print(c(f_valueb1ain,f_valueb2))
if(f_valueb1ain<=f_valueb2){
chflsb_ain = chflsb_ain[,-which.min(f_com)]
f_valueb1ain = f_valueb2
}else{
break
}
}
##
## Call:
## lm(formula = R_happy ~ ., data = chfls.f)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0925 -0.2223 -0.0116 0.2233 1.6100
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.445e+00 5.906e-01 2.446 0.014549 *
## R_region2 -4.399e-02 4.338e-02 -1.014 0.310675
## R_region3 -1.278e-01 5.490e-02 -2.328 0.020050 *
## R_region4 -5.385e-02 4.687e-02 -1.149 0.250753
## R_region5 -1.573e-01 4.647e-02 -3.384 0.000731 ***
## R_region6 -1.623e-01 5.013e-02 -3.237 0.001236 **
## R_age 3.387e-03 1.541e-03 2.198 0.028083 *
## R_edu 3.690e-03 1.649e-02 0.224 0.822917
## R_income 2.154e-05 2.163e-05 0.996 0.319484
## R_health 2.295e-01 1.522e-02 15.079 < 2e-16 ***
## R_height 6.437e-03 3.012e-03 2.137 0.032734 *
## A_height -2.157e-03 2.889e-03 -0.747 0.455343
## A_edu -1.219e-03 1.595e-02 -0.076 0.939096
## A_income 1.797e-05 1.364e-05 1.317 0.187919
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5371 on 1517 degrees of freedom
## Multiple R-squared: 0.1536, Adjusted R-squared: 0.1464
## F-statistic: 21.18 on 13 and 1517 DF, p-value: < 2.2e-16
## [1] 232.429
chflsY = chfls.f%>%select(7,1,2,3,4,5,6,8,9,10)
chfls_dmf = data.frame(Y = chflsY$R_happy)
n_var = 1:9
f_valuef1h = 0
for(i in 1:9){
f_com = c()
for(j in n_var){
chfls_dm = as.data.frame(cbind(chfls_dmf, chflsY[,j+1]))
anova_infoi = anova_info(chfls_dm)
f_com = rbind(f_com, anova_infoi$F_value)
}
n_var = n_var[-which.max(f_com)]
f_valuef2 = max(f_com)
print(c(f_valuef1h,f_valuef2))
if(f_valuef1h<=f_valuef2){
chfls_dmf = as.data.frame(cbind(chfls_dmf, chflsY[,which.max(f_com)+1]))
colnames(chfls_dmf)[length(chfls_dmf)] = colnames(chfls.f)[which.max(f_com)+1]
f_valuef1h = f_valuef2
}else{
break
}
}
## [1] 7.680299
chflsb = chflsY
n_var = 9
f_valueb1h = 0
for(i in 1:9){
f_com = c()
for(j in 1:n_var){
chfls_dm = chflsb[,-(j+1)]
anova_infoi = anova_info(chfls_dm)
f_com = rbind(f_com, anova_infoi$F_value)
}
n_var = n_var-1
f_valueb2 = min(f_com)
print(c(f_valueb1h,f_valueb2))
if(f_valueb1h<=f_valueb2){
chflsb = chflsb[,-which.min(f_com)]
f_valueb1h = f_valueb2
}else{
break
}
}
rm(list = ls())